{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 2.5 处理缺失数据"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 1.NumPy中的缺失数据"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.342842Z",
     "end_time": "2024-05-08T21:16:48.874154Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "data": {
      "text/plain": "(dtype('O'), dtype('int32'))"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.array([1, None, 5, 7, 9])\n",
    "b = np.array([1, 3, 5, 7, 9])\n",
    "a.dtype, b.dtype"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.876960Z",
     "end_time": "2024-05-08T21:16:48.920831Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "data": {
      "text/plain": "dtype('float64')"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = np.array([1, np.nan, 5, 7, 9])\n",
    "c.dtype"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.900315Z",
     "end_time": "2024-05-08T21:16:48.929398Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "nan"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "7 + np.nan"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.922436Z",
     "end_time": "2024-05-08T21:16:48.995874Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "nan"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "0 * np.nan"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.949625Z",
     "end_time": "2024-05-08T21:16:48.999435Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "data": {
      "text/plain": "nan"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sum()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.969680Z",
     "end_time": "2024-05-08T21:16:48.999435Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "data": {
      "text/plain": "22.0"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.nansum(c)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:48.999435Z",
     "end_time": "2024-05-08T21:16:49.020858Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "5.5"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.nanmean(c)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.020858Z",
     "end_time": "2024-05-08T21:16:49.085557Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 2.Pandas处理缺失数据"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "0    1.0\n1    NaN\n2    3.0\n3    NaN\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([1, np.nan, 3, None, 7, 9])\n",
    "s"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.030937Z",
     "end_time": "2024-05-08T21:16:49.091211Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "20.0"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.sum()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.048293Z",
     "end_time": "2024-05-08T21:16:49.091211Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "0    NaN\n1    NaN\n2    3.0\n3    NaN\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[0] = None\n",
    "s"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.088213Z",
     "end_time": "2024-05-08T21:16:49.135948Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [
    {
     "data": {
      "text/plain": "0     True\n1     True\n2    False\n3     True\n4    False\n5    False\ndtype: bool"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isnull()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.119564Z",
     "end_time": "2024-05-08T21:16:49.135948Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "data": {
      "text/plain": "0    False\n1    False\n2     True\n3    False\n4     True\n5     True\ndtype: bool"
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.notnull()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.127974Z",
     "end_time": "2024-05-08T21:16:49.139589Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "data": {
      "text/plain": "2    3.0\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[s.notnull()]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.139589Z",
     "end_time": "2024-05-08T21:16:49.179912Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "2    3.0\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.dropna()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.168902Z",
     "end_time": "2024-05-08T21:16:49.179912Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [
    {
     "data": {
      "text/plain": "0    NaN\n1    NaN\n2    3.0\n3    NaN\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.176840Z",
     "end_time": "2024-05-08T21:16:49.195630Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [
    {
     "data": {
      "text/plain": "   0     1     2   3\n0  1   2.0   NaN   4\n1  5   NaN   7.0   8\n2  9  10.0  11.0  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>2.0</td>\n      <td>NaN</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>NaN</td>\n      <td>7.0</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([[1, 2, np.nan, 4], [5, None, 7, 8], [9, 10, 11, 12]])\n",
    "df"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.200423Z",
     "end_time": "2024-05-08T21:16:49.228504Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [
    {
     "data": {
      "text/plain": "   0     1     2   3\n2  9  10.0  11.0  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.217094Z",
     "end_time": "2024-05-08T21:16:49.280857Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "data": {
      "text/plain": "   0   3\n0  1   4\n1  5   8\n2  9  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis=1)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.247063Z",
     "end_time": "2024-05-08T21:16:49.310033Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [
    {
     "data": {
      "text/plain": "   0   3\n0  1   4\n1  5   8\n2  9  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis='columns')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.273093Z",
     "end_time": "2024-05-08T21:16:49.374504Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [
    {
     "data": {
      "text/plain": "   0      1      2   3\n0  1    2.0 -999.0   4\n1  5 -999.0    7.0   8\n2  9   10.0   11.0  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>2.0</td>\n      <td>-999.0</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>-999.0</td>\n      <td>7.0</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.fillna(-999)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.299918Z",
     "end_time": "2024-05-08T21:16:49.453216Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "data": {
      "text/plain": "   0     1     2   3\n0  1   2.0   NaN   4\n1  5   NaN   7.0   8\n2  9  10.0  11.0  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>2.0</td>\n      <td>NaN</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>NaN</td>\n      <td>7.0</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.330435Z",
     "end_time": "2024-05-08T21:16:49.499495Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [
    {
     "data": {
      "text/plain": "0    NaN\n1    NaN\n2    3.0\n3    3.0\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.ffill()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.352498Z",
     "end_time": "2024-05-08T21:16:49.499994Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "data": {
      "text/plain": "0    3.0\n1    3.0\n2    3.0\n3    7.0\n4    7.0\n5    9.0\ndtype: float64"
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.bfill()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.382454Z",
     "end_time": "2024-05-08T21:16:49.517421Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [
    {
     "data": {
      "text/plain": "     0     1     2     3\n0  1.0   2.0   2.0   4.0\n1  5.0   5.0   7.0   8.0\n2  9.0  10.0  11.0  12.0",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1.0</td>\n      <td>2.0</td>\n      <td>2.0</td>\n      <td>4.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5.0</td>\n      <td>5.0</td>\n      <td>7.0</td>\n      <td>8.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9.0</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.ffill(axis='columns')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.406099Z",
     "end_time": "2024-05-08T21:16:49.517421Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [
    {
     "data": {
      "text/plain": "   0     1     2   3\n0  1   2.0   NaN   4\n1  5   NaN   7.0   8\n2  9  10.0  11.0  12",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>2.0</td>\n      <td>NaN</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>NaN</td>\n      <td>7.0</td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>9</td>\n      <td>10.0</td>\n      <td>11.0</td>\n      <td>12</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.444054Z",
     "end_time": "2024-05-08T21:16:49.564607Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2024-05-08T21:16:49.467005Z",
     "end_time": "2024-05-08T21:16:49.680182Z"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
